Prepare RStudio environment for all tasks to follow.
# Load the necessary packages.
base::require(base)
base::require(knitr)
base::require(markdown)
base::require(testit)
base::require(dplyr)
base::require(reshape2)
base::require(stringr)
base::require(stats)
base::require(ggplot2)
base::require(extrafont)
base::require(gam)
base::require(leaps)
source("https://raw.githubusercontent.com/andkov/psy532/master/scripts/graphs/main_theme.R")
We load the data derived by the script “./reports/data_preparation/dsL_hrs.R”" Variables chosen are age in years, eduction in years, gender, race, if they drink currently, if they smoked ever,
# load a derived dataset
dsL <- readRDS("./data/derived/hrs_drv_dsL.rds")
Load graph settings for creating figures
# load themes used to style graphs
source("./scripts/graphs/graph_themes.R")
Chooses variables for model - End up with model of ~6 or 7 due to conflicting results (seemed reasonable - dropped the least significant factors which were gender, mstot, depressive symptoms and sample weight
#variables used - bmi + rabyear + wtresp+ + shlt + cesd+ conde+ agey+ cogtot+ mstot+ vigact + smokev + drink + shltc
#full
ds.prime<- readRDS("./data/raw/hrs_retirement.rds")
ds <- ds.prime[ds.prime$wave==10, ]
selectVars <- c("hhidpn", "bmi", "conde", "agey", "cogtot", "mstot", "raedyrs", "cesd", "shlt", "ragender", "raracem", "smokev", "drink", "wtresp", "shltc")
ds2 <- ds[,selectVars]
ds3 <- mutate(ds2, shltnum = substring(shlt, 1, 1)) #change the \\D command
ds4 <- dplyr::mutate(ds3, gendernum = substring(ragender, 1, 1)) #decide if needed...
ds5 <- dplyr::mutate(ds4, racenum = substring(raracem, 1, 1))
ds6 <- dplyr::mutate(ds5, smokenum = substring(smokev, 1, 1))
ds7 <- dplyr::mutate(ds6, drinknum = substring(drink, 1, 1))
ds8 <- select(ds7, -shlt)
ds9 <- select(ds8, -ragender)
ds10 <- select(ds9, -raracem)
ds11 <- select(ds10, -smokev)
ds12 <- select(ds11, -drink)
regft.full <- regsubsets(ds12$bmi~., data = ds12, nvmax=18)
summary(regft.full)
Subset selection object
Call: regsubsets.formula(ds12$bmi ~ ., data = ds12, nvmax = 18)
18 Variables (and intercept)
Forced in Forced out
hhidpn FALSE FALSE
conde FALSE FALSE
agey FALSE FALSE
cogtot FALSE FALSE
mstot FALSE FALSE
raedyrs FALSE FALSE
cesd FALSE FALSE
wtresp FALSE FALSE
shltc FALSE FALSE
shltnum2 FALSE FALSE
shltnum3 FALSE FALSE
shltnum4 FALSE FALSE
shltnum5 FALSE FALSE
gendernum2 FALSE FALSE
racenum2 FALSE FALSE
racenum3 FALSE FALSE
smokenum1 FALSE FALSE
drinknum1 FALSE FALSE
1 subsets of each size up to 18
Selection Algorithm: exhaustive
hhidpn conde agey cogtot mstot raedyrs cesd wtresp shltc shltnum2 shltnum3 shltnum4 shltnum5 gendernum2
1 ( 1 ) " " " " "*" " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) " " "*" "*" " " " " " " " " " " " " " " " " " " " " " "
4 ( 1 ) " " "*" "*" " " " " "*" " " " " " " " " " " " " " " " "
5 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " " " " " " "
6 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " " " " " " "
7 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " " " " " "*"
8 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " " " "*" " " "*"
9 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " " " "*" "*" " " "*"
10 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " "*" "*" "*" " " "*"
11 ( 1 ) " " "*" "*" "*" " " "*" " " " " " " "*" "*" "*" "*" "*"
12 ( 1 ) " " "*" "*" "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
13 ( 1 ) " " "*" "*" "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
14 ( 1 ) " " "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
15 ( 1 ) " " "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
16 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
racenum2 racenum3 smokenum1 drinknum1
1 ( 1 ) " " " " " " " "
2 ( 1 ) " " " " " " " "
3 ( 1 ) "*" " " " " " "
4 ( 1 ) "*" " " " " " "
5 ( 1 ) "*" " " " " " "
6 ( 1 ) "*" " " " " "*"
7 ( 1 ) "*" " " " " "*"
8 ( 1 ) "*" " " " " "*"
9 ( 1 ) "*" " " " " "*"
10 ( 1 ) "*" " " " " "*"
11 ( 1 ) "*" " " " " "*"
12 ( 1 ) "*" " " " " "*"
13 ( 1 ) "*" " " "*" "*"
14 ( 1 ) "*" " " "*" "*"
15 ( 1 ) "*" "*" "*" "*"
16 ( 1 ) "*" "*" "*" "*"
17 ( 1 ) "*" "*" "*" "*"
18 ( 1 ) "*" "*" "*" "*"
#Var's of interest - agey, conde,racenum2 (african-american), drinknum1 (yes), cogtot, raedyrs, shltnum4, smokenum1 (yes)
#Variables coefficiants
names(regft.full)
[1] "np" "nrbar" "d" "rbar" "thetab" "first" "last" "vorder" "tol"
[10] "rss" "bound" "nvmax" "ress" "ir" "nbest" "lopt" "il" "ier"
[19] "xnames" "method" "force.in" "force.out" "sserr" "intercept" "lindep" "nullrss" "nn"
[28] "call"
coef(regft.full,18)
(Intercept) hhidpn conde agey cogtot mstot raedyrs cesd
3.881552e+01 5.646507e-10 8.072686e-01 -1.895553e-01 7.727136e-02 3.974560e-02 -1.310662e-01 -1.657343e-02
wtresp shltc shltnum2 shltnum3 shltnum4 shltnum5 gendernum2 racenum2
-2.101632e-06 -2.685605e-01 9.923701e-01 1.354036e+00 1.710996e+00 1.421305e+00 -5.128229e-01 1.171970e+00
racenum3 smokenum1 drinknum1
-2.648775e-01 -2.680000e-01 -4.866865e-01
#coefficiants say...
regfull.sum <- summary(regft.full)
names(regfull.sum)
[1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
par(mfrow=c(2,2))
rsq <-regfull.sum$rsq
plot(rsq,col="purple",xlab= "Num of Vars",ylab="R-Squared", pch=19, type="l")
which.min(rsq)
[1] 1
points(1, rsq[1], pch=4, col="black", lwd=5)
rss <- regfull.sum$rss
plot(rss,col="purple",xlab= "Num of Vars",ylab="RSS" , pch=19)
lines(rss, col="red",lty=2, lwd=1)
which.min(rss)
[1] 18
points(18, rss[18], pch=4, col="black", lwd=5)
cp <- regfull.sum$cp
plot(cp,col="purple",xlab= "Num of Vars",ylab="Cp", pch=19 )
lines(cp, col="red",lty=2, lwd=1)
which.min(cp)
[1] 13
points(13, cp[13], pch=4, col="black", lwd=5)
bic <- regfull.sum$bic
plot(bic,col="purple",xlab= "Num of Vars",ylab="BIC" , pch=19)
lines(bic, col="red",lty=2,lwd=1)
which.min(bic)
[1] 12
points(12, bic[12], pch=4, col="black", lwd=5)
Create Plots of cognition versus BMI (with race as a factor)
#Conde (by race)
plot.1 <- ggplot2::ggplot(data=dsL, aes(x=conde, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(xlab="Number of chronic Health Conditions")+
guides(fill=guide_legend(title="Race"))
plot.1
plot.1a <- plot.1 + facet_grid(. ~ raracem) #may want to take this out....
plot.1a
#age (by race) Graph
plot.2 <- ggplot2::ggplot(data=dsL, aes(x=agey, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Race"))
plot.2
plot.2 + facet_grid(. ~ raracem) #may want to take this out....
#cogtot by race
plot.3 <- ggplot2::ggplot(data=dsL, aes(x=cogtot, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Race"))
plot.3
plot.3 + facet_grid(. ~ raracem) #may want to take this out....
#education in years by race
plot.4 <- ggplot2::ggplot(data=dsL, aes(x=raedyrs, y=bmi, fill=raracem)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("red", "blue", "green"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Race"))
plot.4
plot.4 + facet_grid(. ~ raracem) #may want to take this out....
#Conde (by drinks).
plot.5 <- ggplot2::ggplot(data=dsL, aes(x=conde, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(xlab="Number of chronic Health Conditions")+
guides(fill=guide_legend(title="Drink"))
plot.5
plot.5 + facet_grid(. ~ drink) #may want to take this out....
#age by if drinks
plot.6 <- ggplot2::ggplot(data=dsL, aes(x=agey, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Drink"))
plot.6
plot.6 + facet_grid(. ~ drink) #may want to take this out....
#cogtot by if drinks
plot.7 <- ggplot2::ggplot(data=dsL, aes(x=cogtot, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Drink"))
plot.7
plot.7 + facet_grid(. ~ drink) #may want to take this out....
#education in years by if drinks
plot.8 <- ggplot2::ggplot(data=dsL, aes(x=raedyrs, y=bmi, fill=drink)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("lightgreen", "indianred"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Drink"))
plot.8
plot.8 + facet_grid(. ~ drink) #may want to take this out....
#Conde (by gender) Graph - ideal one...
plot.9 <- ggplot2::ggplot(data=dsL, aes(x=conde, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(xlab="Number of chronic Health Conditions")+
guides(fill=guide_legend(title="Gender"))
plot.9
plot.9 + facet_grid(. ~ ragender) #may want to take this out....
#age (by gender Graph
plot.10 <- ggplot2::ggplot(data=dsL, aes(x=agey, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Gender"))
plot.10
plot.10 + facet_grid(. ~ ragender) #may want to take this out....
#cogtot by gender
plot.11 <- ggplot2::ggplot(data=dsL, aes(x=cogtot, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Gender"))
plot.11
plot.11 + facet_grid(. ~ ragender) #may want to take this out....
#education in years by gender
plot.12 <- ggplot2::ggplot(data=dsL, aes(x=raedyrs, y=bmi, fill=ragender)) +
scale_shape_identity()+
geom_jitter(shape=21, alpha=.5)+
main_theme +
scale_fill_manual(values=c("darkorchid", "dodgerblue3"))+
# labs(color="Race")+
guides(fill=guide_legend(title="Gender"))
plot.12
plot.12 + facet_grid(. ~ ragender) #may want to take this out....
Creates prediction function for models
Creates models of variables of interest
fit.1 <- glm(bmi~conde,data=dsL)
fit.2 <- lm(bmi~poly(conde,2),data=dsL)
fit.3 <- lm(bmi~poly(conde,3),data=dsL)
fit.4 <- lm(bmi~poly(conde,4),data=dsL)
fit.5 <- lm(bmi~poly(conde,5),data=dsL)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #linear is the way to go...?
Analysis of Deviance Table
Model: gaussian, link: identity
Response: bmi
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev
NULL 9416 294329
conde 1 12985 9415 281344
fit.1 <- lm(bmi~agey,data=dsL)
fit.2 <- lm(bmi~poly(agey,2),data=dsL)
fit.3 <- lm(bmi~poly(agey,3),data=dsL)
fit.4 <- lm(bmi~poly(agey,4),data=dsL)
fit.5 <- lm(bmi~poly(agey,5),data=dsL)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #simplest way to explain...quadratic
Analysis of Variance Table
Model 1: bmi ~ agey
Model 2: bmi ~ poly(agey, 2)
Model 3: bmi ~ poly(agey, 3)
Model 4: bmi ~ poly(agey, 4)
Model 5: bmi ~ poly(agey, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 9415 278859
2 9414 278825 1 34.151 1.1529 0.2830
3 9413 278800 1 24.797 0.8371 0.3602
4 9412 278797 1 2.895 0.0977 0.7546
5 9411 278761 1 36.072 1.2178 0.2698
fit.1 <- lm(bmi~raedyrs,data=dsL)
fit.2 <- lm(bmi~poly(raedyrs,2),data=dsL)
fit.3 <- lm(bmi~poly(raedyrs,3),data=dsL)
fit.4 <- lm(bmi~poly(raedyrs,4),data=dsL)
fit.5 <- lm(bmi~poly(raedyrs,5),data=dsL)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #quadratic...
Analysis of Variance Table
Model 1: bmi ~ raedyrs
Model 2: bmi ~ poly(raedyrs, 2)
Model 3: bmi ~ poly(raedyrs, 3)
Model 4: bmi ~ poly(raedyrs, 4)
Model 5: bmi ~ poly(raedyrs, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 9415 292709
2 9414 292521 1 188.573 6.0691 0.01377 *
3 9413 292441 1 79.904 2.5717 0.10883
4 9412 292429 1 11.532 0.3711 0.54240
5 9411 292410 1 19.719 0.6347 0.42567
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit.1 <- lm(bmi~cogtot,data=dsL)
fit.2 <- lm(bmi~poly(cogtot,2),data=dsL)
fit.3 <- lm(bmi~poly(cogtot,3),data=dsL)
fit.4 <- lm(bmi~poly(cogtot,4),data=dsL)
fit.5 <- lm(bmi~poly(cogtot,5),data=dsL)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) #quadratic
Analysis of Variance Table
Model 1: bmi ~ cogtot
Model 2: bmi ~ poly(cogtot, 2)
Model 3: bmi ~ poly(cogtot, 3)
Model 4: bmi ~ poly(cogtot, 4)
Model 5: bmi ~ poly(cogtot, 5)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 9415 293276
2 9414 292787 1 489.14 15.7247 7.38e-05 ***
3 9413 292746 1 41.22 1.3250 0.2497
4 9412 292744 1 2.40 0.0771 0.7812
5 9411 292743 1 1.00 0.0321 0.8579
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Adds models to figures produced above
# attach a line to the graph produced in the previous chunk
#add CI/jitter to lines -- stat.smooth stuff I believe
#after visual examination - cubic chosen for number of health conditions (conde) and for agey (age of participant), linear chosen for education (in years)
#For Condition (by Race)
plot.1ab <- plot.1 + stat_smooth(method="lm", formula= y ~ ns(x,1))
plot.1ab
plot.2ab <- plot.2 + stat_smooth(method="lm", formula= y ~ ns(x,2))
plot.2ab
plot.3ab <- plot.3 + stat_smooth(method="lm", formula= y ~ ns(x,2))
plot.3ab
plot.4ab <- plot.4 + stat_smooth(method="lm", formula= y ~ ns(x,2))
plot.4ab
Data show that the more health conditions you have, the higher your BMI. Given the high co-incidence of obesity (see Patterson et al., 2004) with certain health conditions such as diabetes and heart disease, this is not a surprising finding.
Data show that the older you are, the lower your BMI is. This is not a suprising finding, given that people who live longer tend to have better health, which would ideally correlate with a lower BMI.
Data show that the higher your cognition, the lower your BMI.
Data seem show that the more years of education you have, the higher your BMI. This is somewhat suprising given that conventional wisdom (my bias?) says that education tends to correlate with wealth which correlates with better health. This could either indicate a problem with my conflation of BMI as a measurement of overall health or with differences in the retired population.
Race seems to show that being african american correlates with a higher BMI, this is not a super suprising finding, somewhat expected.
Drinking data seems to show that when participants drink, they are more likely to have a lower BMI. This is a very suprising finding, although it could indicate a problem with their definition of drinking. Unfortunately I was unable to find the operation definition of if a participant drank or not, which could mean either quite a bit of drinking or just a certain number of drinks per week.
Participants rating of their health (self-rated health score) indicates that people who rate their health lower tend to have a higher BMI.
Removes all variables except variables conisdered for models
# # remove all but specified dataset
rm(list=setdiff(ls(), c("dsL")))
dsL <- readRDS("./data/derived/hrs_drv_dsL.rds")